###############################################################################
# Data Descriptive Plots
###############################################################################
# This Script contains the code to build the descriptive figures
###############################################################################
# Content
###############################################################################
# 1) Dependencies
# 2) Load Data
# 3) Aggregation for Fig. 1
# 4) Fig. 1
# 5) Aggregation for Fig. 2
# 6) Fig. 2
# 7) Aggregation for Fig. 3
# 8) Fig. 3
###############################################################################
# 1) Dependencies
###############################################################################
library(readr)
library(dplyr)
library(ggplot2)
library(gganimate)
library(ggeffects)
library(ggExtra)
library(ggridges)
library(ggrepel)
library(grid)
library(scales)
library(lubridate)
library(extrafont)
library(reshape2)
library(here)
library(ggforce)
library(png)
library(readxl)
library(grid)
library(gridExtra)
library(ggpubr)
library(ggalt)
library(stringr)
###############################################################################
# 2) Load Data
###############################################################################
# Set Path
# Point the working directory at the folder containing this script so that the
# relative paths used below ("../data", "../support", "../img") resolve.
# NOTE(review): this relies on rstudioapi, so it presumably only works when the
# script is run from within RStudio - confirm before running it elsewhere.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# Remove every object listed by ls() so the script starts from a clean slate.
rm(list=ls())

# Custom functions
# Discrete x scale for axes built with reorder_within(): everything from the
# first occurrence of `sep` up to the end of a label is removed before the
# label is displayed. All other arguments are handed on unchanged to
# ggplot2::scale_x_discrete().
scale_x_reordered <- function(..., sep = "___") {
  suffix_pattern <- paste0(sep, ".+$")
  drop_suffix <- function(x) {
    gsub(suffix_pattern, "", x)
  }
  ggplot2::scale_x_discrete(labels = drop_suffix, ...)
}
# Order a variable separately inside each facet: every value of `x` is
# suffixed with its `within` group (joined by `sep`) and the combined labels
# are reordered by `by`, summarised per label with `fun`.
# `...` is accepted but not used.
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  stats::reorder(paste(x, within, sep = sep), by, FUN = fun)
}

# Load the project ggplot theme; warnings raised while sourcing are silenced.
suppressWarnings(source('ggplot_theme_ddl.R', encoding = "UTF-8"))

# Combined hit data for 2015 and 2019
# (presumably one row per candidate mention - confirm against the data set).
df <- readRDS("../data/smd_ner_2015_2019_combined.RDS")

# Candidate lists per election year; `id` is converted to character.
candidates_list_15 <- read.csv(
  '../support/candidates-2015/00-Named_Entity_List_withID.csv',
  stringsAsFactors = FALSE
) %>%
  as_tibble() %>%
  mutate(id = as.character(id))
candidates_list_19 <- read.csv(
  '../support/candidates-2019/00-Named_Entity_List_withID.csv',
  stringsAsFactors = FALSE
) %>%
  as_tibble() %>%
  mutate(id = as.character(id))

# Harmonise the 2019 chamber information to "sr" / "nr" / "sr & nr".
# Every whitespace character in `council` is first replaced by a plain space.
# Values that match none of the conditions below become NA.
# NOTE(review): the second and third entry of the first two sets are identical
# as written; presumably one of them once was a whitespace variant - confirm
# before removing the duplicate.
candidates_list_19 <- candidates_list_19 %>%
  mutate(
    candidacy = as.character(gsub("\\s", " ", council)),
    council = case_when(
      candidacy %in% c("SR", "Former Staenderat", "Former Staenderat") ~ "sr",
      candidacy %in% c("NR", "Former Nationalrat", "Former Nationalrat") ~ "nr",
      candidacy %in% c("SR und NR", "NR und SR") ~ "sr & nr"
    )
  ) %>%
  dplyr::select(-c(candidacy))

# Year as character; date reduced to a "month-day" string.
df$year <- as.character(df$year)
df$date <- format(as.Date(df$date, "%m-%d"), format = "%m-%d")

# Manual corrections of individual candidates: two names are replaced by a
# different spelling and Philipp Müller's `incumbent` value is set to 1.
df$fullname <- ifelse(df$fullname == "Adèle Goumaz", "Adèle Thorens Goumaz", df$fullname)
df$fullname <- ifelse(df$fullname == "Niklaus-Samuel Gugger", "Nik Gugger", df$fullname)
df$incumbent <- ifelse(df$fullname == "Philipp Müller", 1, df$incumbent)

#-----------------------------------------------------------------------------#
# Configurations
#-----------------------------------------------------------------------------#
# Remove Federal Councilors
council <- TRUE
# Remove Party Presidents
president <- TRUE

# Unwanted Topics
unwanted_topics <- c('PoliticalSystem', 'Other_unclassified_Political_Texts', 
                     'NotPolitical', 'Not Classified', 'Other_Problems')

# Council members 2015
council_15 <- c("Ueli Maurer", "Alain Berset", "Didier Burkhalter", 
                "Simonetta Sommaruga", "Eveline Widmer Schlumpf", 
                "Johann Schneider-Ammann", "Doris Leuthard")

# Council members 2019
council_19 <- c("Ueli Maurer", "Alain Berset", "Ignazio Cassis", 
                "Simonetta Sommaruga", "Guy Parmelin",
                "Karin Keller-Sutter", "Viola Amherd")

# Party Presidents 2015
presi_15 <- c("Toni Brunner", "Christian Levrat", "Philipp Müller", 
              "Christophe Darbellay", "Regula Rytz", "Martin Bäumle", "Martin Landolt")

# Party Presidents 2019
presi_19 <- c("Albert Rösti", "Christian Levrat", "Petra Gössi", 
              "Gerhard Pfister", "Regula Rytz", "Jürg Grossen", "Martin Landolt")


# Remove Council Members:
# Each year is checked against its own list of names. Because the condition
# only accepts the years "2015" and "2019", rows of any other year are dropped
# as well.
if (isTRUE(council)) {
  df <- df %>% dplyr::filter((year == "2015" & !fullname %in% council_15) |
                               (year == "2019" & !fullname %in% council_19))
}


# Remove Party Presidents:
# Same logic as above, using the party president lists.
if (isTRUE(president)) {
  df <- df %>% dplyr::filter((year == "2015" & !fullname %in% presi_15) |
                               (year == "2019" & !fullname %in% presi_19))
}

# Build the hit table, including candidates that have no hits at all.
# Step 1: count the rows of df per year, date, candidate (with its attributes),
# topic class and council; the count is stored in n.hits.
df_hits_f <- df %>% group_by(year, date, fullname, person.id, gender, party, canton, list_place_1, age, incumbent, selectsclass, council) %>% summarise(n.hits = n())
# Print the distinct council values for inspection
unique(df_hits_f$council)

# Check if council NA are just the articles with no mention at all:
help_council_na <- df_hits_f %>% filter(is.na(council) == T)
unique(help_council_na$fullname)

# Add Candidates with zero mentions via the lists of candidates:
# (the next two calls only print column names and storage modes)
names(df_hits_f)
sapply(df_hits_f, mode)

# Keep only those candidates of each list whose id does not appear as
# person.id among that year's hits.
helper <- filter(df_hits_f, year == "2015")
candidates_list_15 <- candidates_list_15 %>% filter(!id %in% helper$person.id)

helper <- filter(df_hits_f, year == "2019")
candidates_list_19 <- candidates_list_19 %>% filter(!id %in% helper$person.id)


# Bring the remaining 2015 candidates into the column layout of df_hits_f:
# zero hits, no topic class, and the placeholder date "01-01".
candidates_list_15 <- candidates_list_15 %>% dplyr::select(c(age,district,fullname,gender,incumbent,list_place_1,party,id,council)) %>% 
  dplyr::mutate(person.id = id,
                canton = district,
                n.hits = 0,
                selectsclass = NA,
                year = "2015",
                date = "01-01") %>% 
  dplyr::select(-c("district","id"))



# Same reshaping for the remaining 2019 candidates
candidates_list_19 <- candidates_list_19 %>% dplyr::select(c(age,district,fullname,gender,incumbent,list_place_1,party,id,council)) %>% 
  dplyr::mutate(person.id = id,
                canton = district,
                n.hits = 0,
                selectsclass = NA,
                year = "2019",
                date = "01-01") %>% 
  dplyr::select(-c("district","id"))

#Check if we have candidates where we don't know the chamber:
unique(candidates_list_15$council)
unique(candidates_list_19$council)

# Append the zero-hit candidates to the hit table
df_hits_f <- dplyr::bind_rows(df_hits_f,candidates_list_15,candidates_list_19)

# Complete Selectsclass and date for each Person in 2015: within every
# candidate group all combinations of the topic classes and dates found in
# df_hits_f (both years) are created; new rows get n.hits = 0.
# NOTE(review): the date values include the placeholder "01-01" added above -
# confirm that this is intended.
df_hits_15 <- df_hits_f %>% filter(year == "2015") %>% group_by(fullname, person.id, gender, party, canton, list_place_1, age, incumbent, council) %>%  
  tidyr::complete(selectsclass = unique(df_hits_f$selectsclass), date = unique(df_hits_f$date), 
                                                                 fill = list(n.hits = 0,
                                                                             year = "2015"))
# Complete Selectsclass for each Person in each Year
df_hits_19 <- df_hits_f %>% filter(year == "2019") %>% group_by(fullname, person.id, gender, party, canton, list_place_1, age, incumbent, council) %>%  
  tidyr::complete(selectsclass = unique(df_hits_f$selectsclass), date = unique(df_hits_f$date),
                                                                 fill = list(n.hits = 0,
                                                                             year = "2019"))
# Sanity Check: number of distinct persons per year
length(unique(df_hits_19$person.id))
length(unique(df_hits_15$person.id))

# Combine the completed dfs
df_hits_f <- dplyr::bind_rows(df_hits_15,df_hits_19)

# Remove the Class NA (and rows without a person.id)
df_hits_f <- df_hits_f %>% filter(is.na(selectsclass) == F) %>% filter(is.na(person.id) == F)

# Sanity Check: number of rows per year and topic class
sanity <- df_hits_f %>% group_by(year, selectsclass) %>% summarise(n = n())
sanity

# The per-year helper tables are no longer needed
rm(df_hits_19,df_hits_15)
###############################################################################
# 3) Aggregation for Fig. 1
###############################################################################
# Print the available columns for reference
names(df)

# No need to add the missing women without mentions, as it is a share that is reported.
# Steps:
#   n       - rows of df per year, date, gender and doc.id
#   sum_day - sum of n over all rows of the same year and date
#   n_2     - number of rows (one per doc.id) of the same year, date and gender
#   freq    - n_2 / sum_day, perc is the same value times 100
# Afterwards doc.id and n are dropped, duplicates are removed and only the
# rows for gender "f" are kept.
# NOTE(review): the figure built from this table is titled as a share of
# articles, but sum_day sums row counts rather than counting distinct doc.id
# values - confirm that this denominator is intended.
agg1 <- df %>% group_by(year, date, gender, doc.id) %>% 
               summarise(n = n()) %>%
               ungroup() %>% 
               group_by(year,date) %>% 
               mutate(sum_day = sum(n)) %>%
               ungroup %>%
               group_by(year,date,gender) %>% 
               mutate(n_2 = n()) %>%
               ungroup %>% 
               group_by(year, date) %>%
               mutate(freq = n_2 / sum_day,
                      perc = (n_2 / sum_day) * 100) %>% 
               dplyr::select(-c(doc.id,n)) %>%
               distinct(.,.keep_all = T) %>%
               filter(gender %in% c("f"))

# Turn the "month-day" strings into Date values for the date axis
# (no year is given in the format string).
agg1$date <- as.Date(agg1$date, "%m-%d")
###############################################################################
# 4) Fig. 1
###############################################################################
# Line colour per election year
values_year <- c("2019" = "#DD2461", "2015" = "#7D7D7C")

# Scales, labels and theme shared by Fig. 1a and Fig. 1b. The list is built
# anew on every call, so each plot receives its own scale objects.
# (A facet per year could be added here with facet_wrap(~year, ncol = 1).)
.fig1_common <- function() {
  list(
    scale_y_continuous(labels = scales::percent_format(accuracy=.1), expand = c(0, 0)),
    scale_x_date(breaks = "1 month", labels = date_format("%b"), expand = c(0, 0)),
    ddl_theme(),
    labs(title = 'Share of Articles one or more Female Candidates got mentioned in',
         y = 'Share [%]', x = 'Days', color = "Year:"),
    scale_color_manual(name="Year", values = values_year, labels = c("2015", "2019")),
    theme(legend.position = "bottom", legend.direction = "horizontal",
          strip.background = element_blank(), strip.text = element_text(color = "black"),
          axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 16),
          axis.text.y = element_text(hjust=.5, size = 16),
          strip.text.x = element_text(size = 16),
          axis.title = element_text(size = 16),
          plot.title = element_text(size = 20),
          legend.text = element_text(size = 16),
          plot.margin = unit(c(.5,1.3,.5,.5), "cm"),
          legend.key.size = unit(1.5,"line"),
          axis.line.x = element_line(color="black", size = .5),
          axis.line.y = element_line(color="black", size = .5),
          panel.spacing.x=unit(2.5, "lines"))
  )
}

# Fig. 1a: daily values drawn as a line with points
f1 <- ggplot(agg1, aes(y = freq, x=date, color = year )) +
  geom_path(lwd=0.5) +
  geom_point(size = .2) +
  .fig1_common()

f1

ggsave(plot = f1, filename ='../img/descriptive_fig_1a_share_mentions_female.png',width=16, height=12, dpi = 300)

# Fig. 1b: the same data as a loess-smoothed curve without confidence band
f1 <- ggplot(agg1, aes(y = freq, x=date, color = year )) +
  geom_smooth(formula = "y ~ x",method = "loess", se = F, n = 610, span = .5, size = 0.5) +
  .fig1_common()

f1

ggsave(plot = f1, filename ='../img/descriptive_fig_1b_share_mentions_female.png',width=16, height=12, dpi = 300)
###############################################################################
# 5) Aggregation for Fig. 2
###############################################################################
# Print the available columns for reference
names(df)

# As for Fig. 1, missing women and men do not have to be added to the data.
# Count the rows per year, date and gender, drop rows without a gender, and
# express each count as a share of all counts of the same year and date.
# Only the rows for gender "f" are kept.
agg2 <- df %>%
  group_by(year, date, gender) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(!is.na(gender)) %>%
  group_by(year, date) %>%
  mutate(freq = n / sum(n),
         perc = freq * 100) %>%
  filter(gender %in% "f")

# Turn the "month-day" strings into Date values for the date axis
agg2$date <- as.Date(agg2$date, "%m-%d")
###############################################################################
# 6) Fig. 2
###############################################################################
# Values for colors
values_year <- c("2019" = "#DD2461", "2015" = "#7D7D7C")

# Reference shares per year, used for the horizontal lines in Fig. 2a/2b and
# for the normalisation in Fig. 2c/2d.
# NOTE(review): presumably the share of women among all candidates in 2015
# and 2019 - confirm against the candidate lists.
.share_women_15 <- .345
.share_women_19 <- .40

# Horizontal reference lines in the colour of the respective year
.fig2_reference_lines <- function() {
  list(
    geom_hline(yintercept = .share_women_15, color = "#7D7D7C", linetype = "dashed"),
    geom_hline(yintercept = .share_women_19, color = "#DD2461", linetype = "dotted")
  )
}

# Scales, labels and theme shared by all four variants of Fig. 2.
#   title    - plot title
#   y_label  - y axis title
#   y_breaks - optional explicit y axis breaks (ggplot2 default if omitted)
# The list is built anew on every call, so each plot gets its own scales.
.fig2_common <- function(title, y_label, y_breaks = waiver()) {
  list(
    scale_y_continuous(labels = scales::percent_format(accuracy=.1), expand = c(0, 0),
                       breaks = y_breaks),
    scale_x_date(breaks = "1 month", labels = date_format("%b"), expand = c(0, 0)),
    ddl_theme(),
    labs(title = title, y = y_label, x = 'Days', color = "Year:"),
    scale_color_manual(name="Year", values = values_year, labels = c("2015", "2019")),
    theme(legend.position = "bottom", legend.direction = "horizontal",
          strip.background = element_blank(), strip.text = element_text(color = "black"),
          axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 16),
          axis.text.y = element_text(hjust=.5, size = 16),
          strip.text.x = element_text(size = 16),
          axis.title = element_text(size = 16),
          plot.title = element_text(size = 20),
          legend.text = element_text(size = 16),
          plot.margin = unit(c(.5,1.3,.5,.5), "cm"),
          legend.key.size = unit(1.5,"line"),
          axis.line.x = element_line(color="black", size = .5),
          axis.line.y = element_line(color="black", size = .5),
          panel.spacing.x=unit(2.5, "lines"))
  )
}

# Fig. 2a: daily share of mentions of women, line with points.
# The y axis shows a share of mentions (agg2 divides row counts by row
# counts), so it is labelled accordingly and in English like the other figures.
f2 <- ggplot(agg2, aes(y = freq, x=date, color = year)) +
  geom_path(lwd=0.5) +
  geom_point(size = .2) +
  .fig2_reference_lines() +
  .fig2_common(title = 'Share of Mentions by Women over all Mentions',
               y_label = 'Share of Mentions [%]')

f2

ggsave(plot = f2, filename='../img/descriptive_fig_2a_share_of_mentions_m_vs_f.png',width=16, height=12, dpi = 300)

# Fig. 2b: the same data as a loess-smoothed curve
f2 <- ggplot(agg2, aes(y = freq, x=date, color = year)) +
  geom_smooth(formula = "y ~ x",method = "loess", se = F, n = 610, span = .5, size = 0.5) +
  .fig2_reference_lines() +
  .fig2_common(title = 'Share of Mentions by Women over all Mentions',
               y_label = 'Share of Mentions [%]')

f2

ggsave(plot = f2, filename='../img/descriptive_fig_2b_share_of_mentions_m_vs_f.png',width=16, height=12, dpi = 300)

# Normalise the daily share by subtracting the reference share of the year:
# rows of 2015 use the 2015 value, all other rows use the 2019 value.
agg2$freq_norm <- ifelse(agg2$year == "2015",
                         agg2$freq - .share_women_15,
                         agg2$freq - .share_women_19)

# Fig. 2c: normalised daily values, line with points, zero line for reference
f2 <- ggplot(agg2, aes(y = freq_norm, x=date, color = year)) +
  geom_path(lwd=0.5) +
  geom_point(size = .2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey", size = .2) +
  .fig2_common(title = 'Normalized Mentions of Women',
               y_label = 'Deviation from actual Share in Candidates List [%]')

f2

ggsave(plot = f2, filename='../img/descriptive_fig_2c_normalized_mentions_women.png',width=16, height=12, dpi = 300)

# Fig. 2d: normalised values as a loess-smoothed curve with fixed y breaks
f2 <- ggplot(agg2, aes(y = freq_norm, x=date, color = year)) +
  geom_smooth(formula = "y ~ x",method = "loess", se = F, n = 610, span = .5, size = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey", size = .2) +
  .fig2_common(title = 'Normalized Mentions of Women',
               y_label = 'Deviation from actual Share in Candidates List [%]',
               y_breaks = c(-0.125,-0.100,-0.075,-0.050,-0.025,0,0.025,0.050,0.075))

f2

ggsave(plot = f2, filename='../img/descriptive_fig_2d_normalized_mentions_women.png',width=16, height=12, dpi = 300)
###############################################################################
# 7) Aggregation for Fig. 3
###############################################################################
# Print the available columns for reference
names(df)

# filter out classes that will not influence the ranking
dfnew <- filter(df, !selectsclass%in% c('PoliticalSystem', 'Other_unclassified_Political_Texts', 'NotPolitical', 'Not Classified', 'Other_Problems', 'Regions_NationalCohesion'))

# Map the raw class names to English display labels. The labels are assigned
# to the alphabetically sorted class names purely by position, so both must
# have the same length; the check stops the script before labels can be
# attached to the wrong class.
vec <- sort(unique(dfnew$selectsclass))
.recodr <- list('Agriculture', 'Economy', 'Education & Culture', 'Environment', 'Europe', 
                'Finance & Taxes', 'Gender', 'Immigration', 
                'International Relations',
                'Labour Market', 'Law & Order', 'Public Health',
                'Services & Infrastructure',
                'Social Security')
stopifnot(length(.recodr) == length(vec))
.recodr <- setNames(.recodr, vec)

# Same mapping under the second name used in this script
.recodrnobreaks <- .recodr

# Count the rows per year, date, topic class and gender, drop rows without a
# gender, and express each count as a share of all counts of the same year,
# date and class. Only the rows for gender "f" are kept.
agg3 <- dfnew %>%
  group_by(year, date, selectsclass, gender) %>%
  summarise(n = n()) %>%
  filter(is.na(gender) == F) %>%
  ungroup() %>%
  group_by(year, date, selectsclass) %>%
  mutate(freq = n / sum(n),
         perc = (n / sum(n)) * 100) %>%
  filter(gender %in% c("f"))

# Add a row for every date that is missing within a year / class / gender
# group; such rows get a count and share of zero (perc is filled as well, as
# in the aggregation for Fig. 4).
agg3 <- agg3 %>%
  ungroup() %>%
  group_by(year, selectsclass, gender) %>%
  tidyr::complete(date = unique(df$date),
                  fill = list(n = 0,
                              freq = 0,
                              perc = 0))

# Turn the "month-day" strings into Date values for the date axis
agg3$date <- as.Date(agg3$date, "%m-%d")

# Apply the display labels, as the aggregation for Fig. 4 does, so that the
# facets of Fig. 3 are titled with the labels instead of the raw class names.
# The grouping is restored afterwards.
agg3 <- agg3 %>%
  ungroup() %>%
  mutate(selectsclass = dplyr::recode(selectsclass, !!!.recodr),
         selectsclass = factor(selectsclass)) %>%
  group_by(year, selectsclass, gender)
###############################################################################
# 8) Fig. 3
###############################################################################
# Values for colors
values_year <- c("2019" = "#DD2461", "2015" = "#7D7D7C")

# Reference lines, scales, facets, labels and theme shared by Fig. 3a and 3b.
# The list is built anew on every call, so each plot gets its own scales.
# The y axis is labelled as a share of mentions: agg3 divides row counts by
# row counts, which is also what the plot title describes.
.fig3_common <- function() {
  list(
    geom_hline(yintercept = .345, color = "#7D7D7C", linetype = "dashed"),
    geom_hline(yintercept = .40, color = "#DD2461", linetype = "dotted"),
    scale_y_continuous(labels = scales::percent_format(accuracy=.1), expand = c(0, 0)),
    scale_x_date(breaks = "2 month", labels = date_format("%b"), expand = c(0, 0)),
    facet_wrap(~selectsclass, ncol = 3),
    ddl_theme(),
    labs(title = 'Share of Mentions by Women over all Mentions',
         y = 'Share of Mentions [%]', x = 'Days', color = "Year:"),
    scale_color_manual(name="Year", values = values_year, labels = c("2015", "2019")),
    theme(legend.position = "bottom", legend.direction = "horizontal",
          strip.background = element_blank(), strip.text = element_text(color = "black"),
          axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 16),
          axis.text.y = element_text(hjust=.5, size = 16),
          strip.text.x = element_text(size = 16),
          axis.title = element_text(size = 16),
          plot.title = element_text(size = 20),
          legend.text = element_text(size = 16),
          plot.margin = unit(c(.5,1.3,.5,.5), "cm"),
          legend.key.size = unit(1.5,"line"),
          axis.line.x = element_line(color="black", size = .5),
          axis.line.y = element_line(color="black", size = .5),
          panel.spacing.x=unit(2.5, "lines"))
  )
}

# Fig. 3a: daily values per topic class, line with points
f3 <- ggplot(agg3, aes(y = freq, x=date, color = year)) +
  geom_path(lwd=0.5) +
  geom_point(size = .2) +
  .fig3_common()

f3

ggsave(plot = f3, filename='../img/descriptive_fig_3a_share_of_mentions_with_female_topics.png',width=16, height=25, dpi = 300)

# Fig. 3b: the same data as loess-smoothed curves
f3 <- ggplot(agg3, aes(y = freq, x=date, color = year)) +
  geom_smooth(formula = "y ~ x",method = "loess", se = F, n = 610, span = .5, size = 0.5) +
  .fig3_common()

f3

ggsave(plot = f3, filename='../img/descriptive_fig_3b_share_of_mentions_with_female_topics.png',width=16, height=12, dpi = 300)
###############################################################################
# 9) Aggregation for Fig. 4
###############################################################################
# Print the available columns for reference
names(df)

# Same construction as for Fig. 1, additionally split by topic class:
#   n       - rows of df per year, date, class, gender and doc.id
#   sum_day - sum of n over all rows of the same year, date and class
#   n_2     - number of rows (one per doc.id) of the same year, date, class
#             and gender
#   freq    - n_2 / sum_day, perc is the same value times 100
# Afterwards doc.id and n are dropped, duplicates are removed and only the
# rows for gender "f" are kept.
agg4 <- df %>% group_by(year, date, selectsclass, gender, doc.id) %>% 
               summarise(n = n()) %>%
               ungroup() %>% 
               group_by(year,date, selectsclass) %>% 
               mutate(sum_day = sum(n)) %>%
               ungroup %>%
               group_by(year,date,selectsclass, gender) %>% 
               mutate(n_2 = n()) %>% 
               ungroup %>% 
               group_by(year, date, selectsclass) %>%
               mutate(freq = n_2 / sum_day,
                      perc = (n_2 / sum_day) * 100) %>% 
               dplyr::select(-c(doc.id,n)) %>%
               distinct(.,.keep_all = T) %>%
               filter(gender %in% c("f"))

# Add a row for every date that is missing within a year / class / gender
# group; freq and perc of such rows are set to zero.
# NOTE(review): column n was dropped above, so the fill value for n
# presumably has no effect - confirm.
agg4 <- agg4 %>% ungroup %>% group_by(year, selectsclass, gender) %>% 
                 tidyr::complete(date = unique(df$date), fill = list(n = 0,
                                                                     freq = 0,
                                                                     perc = 0))


# Turn the "month-day" strings into Date values for the date axis
agg4$date <- as.Date(agg4$date, "%m-%d")

# filter out classes that will not influence the ranking
agg4 <- filter(agg4, !selectsclass%in% c('PoliticalSystem', 'Other_unclassified_Political_Texts', 'NotPolitical', 'Not Classified', 'Other_Problems', 'Regions_NationalCohesion'))

# recode the classes to English display labels
# The labels are assigned to the alphabetically sorted class names by
# position. NOTE(review): this assumes exactly as many remaining classes as
# labels, in matching alphabetical order - confirm.
vec <- sort(unique(agg4$selectsclass))
.recodr <- list('Agriculture', 'Economy', 'Education & Culture', 'Environment', 'Europe', 
                'Finance & Taxes', 'Gender', 'Immigration', 
                'International Relations',
                'Labour Market', 'Law & Order', 'Public Health',
                'Services & Infrastructure',
                'Social Security') %>% 
  setNames(., vec)

# Second mapping with the same labels (not used within this section)
.recodrnobreaks <- list('Agriculture', 'Economy', 'Education & Culture', 'Environment', 'Europe', 
                        'Finance & Taxes', 'Gender', 'Immigration', 
                        'International Relations',
                        'Labour Market', 'Law & Order', 'Public Health',
                        'Services & Infrastructure',
                        'Social Security') %>% 
  setNames(., vec)

# Replace the raw class names by their labels and store them as a factor
agg4 <- agg4 %>%  mutate(selectsclass=dplyr::recode(selectsclass, !!!.recodr),
                                   selectsclass=factor(selectsclass))

###############################################################################
# 10) Fig. 4
###############################################################################
# Line colour per election year
values_year <- c("2019" = "#DD2461", "2015" = "#7D7D7C")

# Scales, facets, labels and theme shared by Fig. 4a and Fig. 4b. The list is
# built anew on every call, so each plot receives its own scale objects.
.fig4_common <- function() {
  list(
    scale_y_continuous(labels = scales::percent_format(accuracy=.1), expand = c(0, 0)),
    scale_x_date(breaks = "2 month", labels = date_format("%b"), expand = c(0, 0)),
    facet_wrap(~selectsclass, ncol = 3),
    ddl_theme(),
    labs(title = 'Share of Articles one or more Female Candidates got mentioned in',
         y = 'Share [%]', x = 'Days', color = "Year:"),
    scale_color_manual(name="Year", values = values_year, labels = c("2015", "2019")),
    theme(legend.position = "bottom", legend.direction = "horizontal",
          strip.background = element_blank(), strip.text = element_text(color = "black"),
          axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 16),
          axis.text.y = element_text(hjust=.5, size = 16),
          strip.text.x = element_text(size = 16),
          axis.title = element_text(size = 16),
          plot.title = element_text(size = 20),
          legend.text = element_text(size = 16),
          plot.margin = unit(c(.5,1.3,.5,.5), "cm"),
          legend.key.size = unit(1.5,"line"),
          axis.line.x = element_line(color="black", size = .5),
          axis.line.y = element_line(color="black", size = .5),
          panel.spacing.x=unit(2.5, "lines"))
  )
}

# Fig. 4a: daily values per topic class, line with points
f4 <- ggplot(agg4, aes(y = freq, x=date, color = year)) +
  geom_path(lwd=0.5) +
  geom_point(size = .2) +
  .fig4_common()

f4

ggsave(plot = f4, filename='../img/descriptive_fig_4a_share_of_articles_with_female_topics.png',width=16, height=25, dpi = 300)

# Fig. 4b: the same data as loess-smoothed curves without confidence band
f4 <- ggplot(agg4, aes(y = freq, x=date, color = year)) +
  geom_smooth(formula = "y ~ x",method = "loess", se = F, n = 610, span = .5, size = 0.5) +
  .fig4_common()

f4

ggsave(plot = f4, filename='../img/descriptive_fig_4b_share_of_articles_with_female_topics.png',width=16, height=12, dpi = 300)

###############################################################################
# 11) Aggregation for Fig. 5
###############################################################################
# Print the available columns for reference
names(df)

# Rows of df counted per year, candidate, gender and party (genders "f" and
# "m" only); within every year / gender combination the rows with the ten
# largest counts are kept.
agg5 <- df %>%
  filter(gender %in% c("f", "m")) %>%
  group_by(year, fullname, gender, party) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(year, gender) %>%
  slice_max(order_by = n, n = 10)

###############################################################################
# 12) Fig. 5
###############################################################################
# party orders, colors
# Take the party colours from colourList (not defined in this part of the
# script; presumably provided by the sourced theme file - confirm), switch
# their names to upper case and keep the entries of the parties in agg5.
.fill2 <- unlist(colourList[['colour']][['parties']])
names(.fill2) <- toupper(names(.fill2))

.fill2 <- .fill2[unique(agg5$party)]

# Horizontal bar chart of the counts in agg5, one panel per gender and year.
# reorder_within() orders the candidates by n within each year and
# scale_x_reordered() removes the suffix it appends from the axis labels.
# NOTE(review): the panels are split by gender and year, but the ordering key
# only uses year - confirm that the bars are sorted as intended.
f5 <- ggplot(agg5, aes(x=reorder_within(fullname, n, year), y=n, color = party, fill = party)) +
      geom_bar(stat = "identity") +
      facet_wrap(gender~year, scales = "free", 
                 labeller= labeller(gender = c(`f` = "Women", `m` = "Men"),
                                    year = c(`2015` = "2015", `2019` = "2019"))) + 
      scale_x_reordered(expand = c(0,0)) + 
      scale_color_manual(values=.fill2) +
      scale_fill_manual(values=.fill2) +
      coord_flip() +
      labs(title = "Total Mentions of Top 10 Candidates by Gender and Year",
           y = "Total number of Mentions", color = "Party:", fill = "Party:") +
      # NOTE(review): ddl_theme() is given legend.position='none', while the
      # theme() call added after it sets "bottom" - presumably the later call
      # wins; confirm.
      ddl_theme(type = 'default',
                panel.grid.major=element_blank(),
                legend.position='none',
                axis.line.y.left = element_line(colour="black"),
                axis.line.x.bottom = element_line(colour="black")) +
      theme(legend.position = "bottom", legend.direction = "horizontal",
            axis.title.y = element_blank(),
            axis.title.x = element_text(size = 16),
            plot.title = element_text(size = 20),
            legend.text = element_text(size = 16),
            axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 16),  
            axis.text.y = element_text(hjust=.5, size = 16),
            strip.text.x = element_text(size = 16),
            plot.margin = unit(c(.5,.5,.5,.6), "cm"))

f5

# GGsave() is not defined in this part of the script (presumably a project
# helper from the sourced theme file - confirm).
GGsave(f5, filename='../img/descriptive_fig_5_top_candidates_year_gender.png', format='rect', width=12, height=8) 
###############################################################################
# 13) Aggregation for Fig. 6
###############################################################################
# Parse the publication date and derive calendar week and year per row.
# NOTE(review): ymd() parses date-only values; confirm that `pubDateTime`
# carries no time-of-day component, otherwise the parse yields NA.
dfnew <- df %>%
  mutate(pubDateTime = ymd(pubDateTime),
         weekd = week(pubDateTime),
         yeard = year(pubDateTime))

# Isolate the `date` of the earliest row of every (year, week) pair.
# NOTE(review): `firstday` is joined onto `dfnew` below, but the aggregation
# further down groups by `date`, not by `firstday`. Confirm whether the
# figure is meant to be aggregated per week (as this step suggests) or per
# `date`.
first_weekday <- dfnew %>%
  group_by(yeard, weekd) %>%
  arrange(pubDateTime) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(yeard, weekd, firstday = date)

# Join the week anchor back onto the row-level data. The keys are spelled out
# so that the join cannot silently pick up additional shared columns.
dfnew <- left_join(dfnew, first_weekday, by = c("yeard", "weekd"))

# Filter out classes that will not influence the ranking.
dfnew <- filter(
  dfnew,
  !selectsclass %in% c('PoliticalSystem', 'Other_unclassified_Political_Texts',
                       'NotPolitical', 'Not Classified', 'Other_Problems')
)

# Recode the classes to German.
# The labels below are assigned POSITIONALLY to the alphabetically sorted
# class names in `vec`, so their order must match that sort order.
vec <- sort(unique(dfnew$selectsclass))
.recodr <- list('Landwirtschaft', 'Wirtschaft', 'Erziehung &\nKultur', 'Umwelt & Energie', 'EU / Europa', 
                'Finanzen &\nSteuern', 'Geschlechterfragen &\nDiskriminierung', 'Immigration &\nAsyl', 
                'Int. Beziehungen &\nArmee',
                'Arbeitsmarkt', 'Recht &\nOrdnung', 'Gesundheitswesen',
                'Öffentliche Dienste &\nInfrastruktur', 'Regionen &\nnat. Zusammenhalt', 
                'Sozialversicherung /\nSozialstaat')

# Same labels without line breaks.
.recodrnobreaks <- list('Landwirtschaft', 'Wirtschaft', 'Erziehung & Kultur', 'Umwelt & Energie', 'EU / Europa', 
                        'Finanzen & Steuern', 'Geschlechterfragen & Diskriminierung', 'Immigration & Asyl', 
                        'Int. Beziehungen & Armee',
                        'Arbeitsmarkt', 'Recht & Ordnung', 'Gesundheitswesen',
                        'Öffentliche Dienste & Infrastruktur', 'Regionen & nat. Zusammenhalt', 
                        'Sozialversicherung / Sozialstaat')

# Fail loudly if the data contain more or fewer classes than there are labels;
# a positional mapping would otherwise attach labels to the wrong topics.
if (length(vec) != length(.recodr) || length(vec) != length(.recodrnobreaks)) {
  stop("Found ", length(vec), " topic classes but ", length(.recodr),
       " German labels; update the recode lists.", call. = FALSE)
}
.recodr <- setNames(.recodr, vec)
.recodrnobreaks <- setNames(.recodrnobreaks, vec)

# Aggregate: number of rows per year, date and class, and each class's share
# of all rows of the same year and date.
aggr <- dfnew %>%
  group_by(yeard, date, selectsclass) %>%
  # keep the (yeard, date) grouping so that sum(n) below is the total per
  # year and date
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(perc = n / sum(n)) %>%
  ungroup() %>%
  arrange(desc(perc)) %>%
  mutate(selectsclass = dplyr::recode(selectsclass, !!!.recodr),
         selectsclass = factor(selectsclass))

# Make sure every year x class combination has a row for every value of
# `df$date`; rows added here get n = 0 and perc = NA.
aggr <- aggr %>%
  group_by(yeard, selectsclass) %>%
  tidyr::complete(date = unique(df$date), fill = list(n = 0)) %>%
  ungroup()

rm(dfnew)
# there might not be data points for all topics for every week
# create expander to make sure that there are NAs for such a case for each topic
.expandr <- expand.grid(selectsclass = unique(aggr$selectsclass),
                        date = unique(df$date))
aggr <- left_join(.expandr, aggr, by = c("selectsclass", "date"))

# NOTE(review): the "%m-%d" format suggests `date` holds month-day strings,
# which as.Date() then places in the current year -- confirm.
aggr$date <- as.Date(aggr$date, format = "%m-%d")
aggr$yeard <- as.character(aggr$yeard)
# line colours per election year
values_year <- c("2019" = "#DD2461", "2015" = "#7D7D7C")

# Drop grid rows that could not be matched to any year.
aggr <- aggr %>% filter(!is.na(yeard))
###############################################################################
# 14) Fig. 6
###############################################################################
# Timeline of topic shares: one panel per topic, one line per election year.
#
# * `group` combines year and topic. With `group = selectsclass` alone, both
#   years of a panel form a single path and consecutive rows of different
#   years get connected to each other.
# * geom_line() orders the observations along the x axis before connecting
#   them, so the result does not depend on the row order of `aggr`.
# * The grey reference lines (0 to 20 percent in steps of 5) come from one
#   five-row layer instead of five layers that each draw one segment per row
#   of `aggr`. Its data has no `selectsclass` column, so the layer is repeated
#   in every panel.
f6 <- ggplot(aggr, aes(y = perc, x = date,
                       group = interaction(yeard, selectsclass),
                       color = yeard, fill = yeard)) +
  geom_segment(
    data = data.frame(y = c(0, .05, .1, .15, .2),
                      x = min(aggr$date, na.rm = TRUE),
                      xend = max(aggr$date, na.rm = TRUE)),
    aes(x = x, xend = xend, y = y, yend = y),
    colour = 'grey',
    inherit.aes = FALSE
  ) +
  geom_line(lwd = 1) +
  #geom_text_repel(data=filter(aggr, date==max(date)), aes(y=perc, x=date, label=selectsclass, colour=selectsclass), hjust=-.1, family='Open Sans') +
  scale_y_continuous(expand = c(0.001, 0.001),
                     labels = scales::percent_format(accuracy = 1L)) +
  scale_x_date(breaks = "1 month", labels = date_format("%B")) +
  #scale_x_datetime(limits = c(min(aggr$date), max(aggr$date)+ddays(60))) +
  ddl_theme() +
  labs(title = 'Die Themen des Wahljahres 2015 und 2019',
       subtitle = paste0("Anteil Artikel mit dem jeweiligen dominanten Thema pro Woche.\n", 
                         "Zeitungen: 84"),
       y = 'Anteil Artikel',
       x = 'Woche',
       color = "Year:", fill = "Year:"
  ) + 
  # the scale name given here takes precedence over the colour title in labs()
  scale_color_manual(name = "Year", values = values_year) +
  theme(legend.position = "bottom", legend.direction = "horizontal") +
  facet_wrap(~selectsclass)

# Show the figure on the active graphics device.
f6

# Export via the project helper GGsave() (not defined in this part of the script).
GGsave(f6, filename='../img/descriptive_fig_6_topics_timeline.png', format='rect', width=10, height=12) 
###############################################################################